################################################################################################
rm(list=ls()) 
x=c("rgdal","rgeos","sp","geosphere","tidyr","broom","ggplot2")
wd="C:/Users/mg5797/Documents/Pollution/ReplicationPackage"

lapply(x, library, character.only=TRUE) # load the required packages
library(dplyr)
library(haven)
# plant by year file : 

setwd("Derived Data")

df=read_dta("plant_by_year_withloc.dta")


# recode nuclear to renewable:
df$fuel[df$fuel=="nuclear"]="renewable"


# create aggregate production: for earch year, and each fuel type, sum up total production
aggprod=df %>% group_by(year,fuel) %>% summarise(totalayprod=sum(AY_prod)) %>% ungroup()





############ Figure:  Total Coal Gas

totalcoal_gas=aggprod[aggprod$fuel %in% c("coal","gas"),] # only coal gas
totalcoal_gas=totalcoal_gas[totalcoal_gas$year>2008,] # look past 2008
totalcoal_gas$Fuel=totalcoal_gas$fuel
totalcoal_gas$ayprodscale=totalcoal_gas$totalayprod/1000000

ggplot(totalcoal_gas, aes(x=year, y=ayprodscale,linetype=Fuel)) + geom_line(aes(linetype=Fuel), size=1.25) + geom_vline(xintercept = 2011.5)+scale_y_continuous( breaks = seq(600, 1400, by=200),labels=c("600","800","1000","1200","1400"),limits=c(600,1400))  + scale_x_continuous( breaks = seq(2009, 2018, by=1),labels=c("08-09","09-10","10-11","11-12","12-13","13-14","14-15","15-16","16-17","17-18"))  + labs(x="Academic Year", y="Aggregate Production (One million MwH)", title="Total Coal and Natural Gas Production in U.S. : 2009 -2018") + theme_classic() + theme(axis.text=element_text(size=18),
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    axis.title=element_text(size=18,face="bold"),legend.position="bottom",legend.text = element_text(size = 20),text = element_text(size = 18))
ggsave("../output/FigureA1.pdf", width=11,height=10)  


########################################
########################################
# get lag for growth: 
aggprod=aggprod %>% group_by(fuel) %>% mutate(totalayprod_lag=dplyr::lag(totalayprod, order_by = year)) %>% ungroup()


# calculate percentage growth in fuel year-by-year
aggprod$growth=(aggprod$totalayprod-aggprod$totalayprod_lag)/aggprod$totalayprod_lag*100
aggprod=aggprod[,c("growth","fuel","year","totalayprod","totalayprod_lag")]

# split by fuel type
#coal:
aggcoal=aggprod[aggprod$fuel=="coal",];aggcoal$totalcoal=aggcoal$totalayprod; aggcoal$totalcoal_lag=aggcoal$totalayprod_lag; aggcoal$fuel=NULL;  aggcoal$growth=NULL; aggcoal$totalayprod=NULL; aggcoal$totalayprod_lag=NULL

#gas:
agggas=aggprod[aggprod$fuel=="gas",];agggas$totalgas=agggas$totalayprod; agggas$totalgas_lag=agggas$totalayprod_lag; agggas$fuel=NULL ; agggas$growth=NULL;  agggas$totalayprod=NULL ; agggas$totalayprod_lag=NULL

# renewable:
aggrenewable=aggprod[aggprod$fuel=="renewable",]; aggrenewable$totalrenewable=aggrenewable$totalayprod; aggrenewable$totalrenewable_lag=aggrenewable$totalayprod_lag;aggrenewable$fuel=NULL; aggrenewable$growth=NULL;  aggrenewable$totalayprod=NULL;  aggrenewable$totalayprod_lag=NULL

#oil
aggoil=aggprod[aggprod$fuel=="oil",];aggoil$totaloil=aggoil$totalayprod; aggoil$totaloil_lag=aggoil$totalayprod_lag; aggoil$fuel=NULL ; aggoil$growth=NULL; aggoil$totalayprod=NULL; aggoil$totalayprod_lag=NULL

# merge all in wide format:

finalfuelagg=merge(aggcoal,agggas,by="year"); finalfuelagg=merge(finalfuelagg,aggrenewable,by="year"); finalfuelagg=merge(finalfuelagg,aggoil,by="year")

rm(aggcoal,agggas,aggrenewable,aggoil)
#  keep all fuels

df=df[df$AY_prod>0,]


# drop biomass:
df=df[df$fuel!="biomass",]

#####################################################################
# subset out plants with lat /lon and those without:
dfmatched=df[df$X_merge=="matched",] # these plants have lat/lon

df_nomatch=df[df$X_merge=="master only",] # these plants do not have lat /lon from 2012


df_nomatch$Latitude=NULL; df_nomatch$Longitude=NULL


# checked
############################### Import coal plant closings file to get lat/lon ################ 

df_close=read.csv("../Raw Data/5. Electricity Data/eia_retirementlist_2019.csv", stringsAsFactors = FALSE, header=TRUE) # all plants that retired in 2019 or earlier

df_close=unique(df_close[,c("Plant.ID","Plant.State","Latitude","Longitude")]) # just get the lat/lon coordinates
# Plant State/Id are unique identifiers
# rename
df_close$PlantID=df_close$Plant.ID; df_close$Plant.ID=NULL
df_close$State=df_close$Plant.State; df_close$Plant.State=NULL

# merge in lat lon
dfmatchclose=merge(df_nomatch,df_close,by=c("PlantID","State"),all.x=TRUE) # checked

# Manually fill in for some missing

# Plant ID: 54243 = Brown Williamson Tobacco Coal Plant GA USA: http://globalenergyobservatory.org/geoid/217
dfmatchclose$Latitude[dfmatchclose$PlantID==54243 & dfmatchclose$State=="GA"]=32.8011; dfmatchclose$Longitude[dfmatchclose$PlantID==54243 & dfmatchclose$State=="GA"]=-83.693

# Plant ID: 62319  https://www.westernsugar.com/who-we-are/our-facilities/billings-mt-manufacturing-facility/
dfmatchclose$Latitude[dfmatchclose$PlantID==62319 & dfmatchclose$State=="MT"]=45.768820; dfmatchclose$Longitude[dfmatchclose$PlantID==62319 & dfmatchclose$State=="MT"]=-108.497260


# append 

dfcoalprod=rbind(dfmatched,dfmatchclose); rm(dfmatched, dfmatchclose, df_nomatch, df_close)


###################################################################### 

# geocode non matches:
dfcoalprod_match=dfcoalprod[!is.na(dfcoalprod$Latitude),] # these are matched: have a latitude coordinate

dfcoalprod_nomatch=dfcoalprod[is.na(dfcoalprod$Latitude),]; dfcoalprod_nomatch$Latitude=NULL; dfcoalprod_nomatch$Longitude=NULL

# plants to geocode:
plantgeocode=unique(dfcoalprod_nomatch[,c("PlantID","State")]) # just get plant id and state

# collect the 2007 plants (EIA 860 2007) : earliest one with street address
plant2007=read.csv("../Raw Data/5. Electricity Data/PlantY07.csv", stringsAsFactors = FALSE, header=TRUE)
plant2007$PlantID=plant2007$PLNTCODE; plant2007$State=plant2007$STATE

# merge in address info: 
plantgeocode=merge(plantgeocode,plant2007, by=c("PlantID","State"))
plantgeocode=plantgeocode[,c("PlantID","State","MAIL_STREET_ADDRESS","MAIL_CITY","ZIP5")] # only 30 plants to geocode 

# geocode:
library(censusxy)
plantgeocode_out <- cxy_geocode(plantgeocode, street = "MAIL_STREET_ADDRESS", city = "MAIL_CITY", state = "State",zip="ZIP5", 
                                output = "simple", class = "dataframe")

# manually fill in some missing ones through website for street address to lat/lon
plantgeocode_out$cxy_lat[plantgeocode_out$PlantID==10683 & plantgeocode_out$State=="CO"]=40.248510; plantgeocode_out$cxy_lon[plantgeocode_out$PlantID==10683 & plantgeocode_out$State=="CO"]=-103.623250

plantgeocode_out$cxy_lat[plantgeocode_out$PlantID==10851 & plantgeocode_out$State=="NJ"]=40.635140; plantgeocode_out$cxy_lon[plantgeocode_out$PlantID==10851 & plantgeocode_out$State=="NJ"]=-75.134350

plantgeocode_out$cxy_lat[plantgeocode_out$PlantID==3506 & plantgeocode_out$State=="TX"]=31.575850; plantgeocode_out$cxy_lon[plantgeocode_out$PlantID==3506 & plantgeocode_out$State=="TX"]=-96.964500

plantgeocode_out$cxy_lat[plantgeocode_out$PlantID==50168 & plantgeocode_out$State=="LA"]=30.184420; plantgeocode_out$cxy_lon[plantgeocode_out$PlantID==50168 & plantgeocode_out$State=="LA"]=-90.971750


# only keep geocoded plants:
plantgeocode_out=plantgeocode_out[!is.na(plantgeocode_out$cxy_lon),]
plantgeocode_out$Latitude=plantgeocode_out$cxy_lat; plantgeocode_out$Longitude=plantgeocode_out$cxy_lon
plantgeocode_out=plantgeocode_out[,c("PlantID","State","Latitude","Longitude")]

# merge:
dfcoalprod_nomatch_geo=merge(dfcoalprod_nomatch,plantgeocode_out, by=c("PlantID","State"))
dfcoalprod=rbind(dfcoalprod_match,dfcoalprod_nomatch_geo)

# just get the plants: 

dfcoal=unique(dfcoalprod[,c("PlantID","State","Latitude","Longitude")])
dfcoal=dfcoal[!is.na(dfcoal$Latitude) & !is.na(dfcoal$Longitude),] # dataframe of all producing plants with a geocode
# dfcoalprod: plant production by year
# dfcoal: plant locations
rm(dfcoalprod_match,dfcoalprod_nomatch,dfcoalprod_nomatch_geo,plant2007,plantgeocode,plantgeocode_out)
write.csv(dfcoalprod,"dfcoalprod.csv")

write.csv(finalfuelagg,"finalfuelagg.csv")
write.csv(aggprod,"aggprod.csv")

write.csv(dfcoal,"dfcoal.csv")

